# title:   Presidential Candidates Nobody Wants?
# content: script to reproduce results reported in the main text of the paper
# author:  Zoltan Fazekas
# note:    session info included in a separate document
#          figure and table saving is commented out
#          assumes data file to be located in the same folder as script


# Packages and data -------------------------------------------------------
library("tidyverse")
library("tidytext")
library("quanteda")
library("gridExtra")
library("texreg")

# read the survey data; the statements below expect it to provide `us`
load(file = "us-presidents.Rdata")

# add label columns (used for plot facets); values that match none of the
# listed conditions are left as NA
us <- mutate(
  us,
  # three-way party grouping built from pid7
  dem_rep_labs = case_when(
    pid7 < 4 ~ "Self-reported Democrats",
    pid7 == 4 ~ "Self-reported Independents",
    pid7 > 4 & pid7 < 8 ~ "Self-reported Republicans"
  ),
  # vote intention, only for the two codes that are given a label here
  vote_labs = case_when(
    presvote16 == 2 ~ "Will vote Donald Trump",
    presvote16 == 1 ~ "Will vote Hillary Clinton"
  ),
  # primary vote: the Democratic primary variable is checked first, the
  # Republican one only when no Democratic condition matched
  primary_labs = case_when(
    demprimary2016 == 1 ~ "Voted in primary for H Clinton",
    demprimary2016 != 1 & demprimary2016 < 97 ~
      "Voted in primary for Other Democrat",
    repprimary2016 == 16 ~ "Voted in primary for D Trump",
    repprimary2016 != 16 & repprimary2016 < 97 ~
      "Voted in primary for Other Republican"
  )
)

# Open-ended preferences --------------------------------------------------

# create Figure 1
# panel a: share of every answer category among respondents whose answer is
# not "NAP"; three categories get longer display labels
f1a_data <- us |>
  filter(alter_category != "NAP") |> # remove NAP (missing)
  mutate(
    alter_category = case_when(
      alter_category == "Politician" ~ "Other politician*",
      alter_category == "Candidate" ~ "Either candidate\n(HRC/DT)",
      alter_category == "Primary" ~ "Other primary candidate",
      TRUE ~ alter_category
    )
  ) |>
  count(alter_category, name = "p") |>
  mutate(p = p / sum(us$alter_category != "NAP")) |>
  arrange(desc(p)) |>
  mutate(what = "1. Original")

# panel b: same shares after merging the two politician categories and
# collapsing six small categories into a single "Other" row
f1b_data <- us |>
  filter(alter_category != "NAP") |>
  mutate(
    alter_category = case_when(
      alter_category %in% c("Politician", "Primary") ~
        "Other politician/\nOther primary candidate*",
      alter_category == "Candidate" ~ "Either candidate\n(HRC/DT)",
      alter_category %in% c("Do not know", "Personal", "Other",
                            "Deity/Fictional", "Nobody", "Not candidates") ~
        "Other (collapse)",
      TRUE ~ alter_category
    )
  ) |>
  count(alter_category, name = "p") |>
  mutate(p = p / sum(us$alter_category != "NAP")) |>
  arrange(desc(p)) |>
  mutate(what = "2. Grouped")

# both codings stacked; `what` tells them apart when faceting
f1ab_data <- bind_rows(f1a_data, f1b_data)

# panels a/b: one grey line + dot per answer category, faceted by coding
# scheme; the "Either candidate" row is redrawn in black and labelled with
# its rounded percentage
f1ab_fig <- f1ab_data |>
  ggplot(aes(x = reorder(alter_category, p), y = p, ymin = 0, ymax = p)) +
  geom_linerange(colour = "grey70", linewidth = 1) +
  geom_linerange(
    data = f1ab_data |>
      filter(alter_category == "Either candidate\n(HRC/DT)"),
    linewidth = 1,
    colour = "black"
  ) +
  geom_point(colour = "grey70", size = 1.5) +
  geom_label(
    data = f1ab_data |>
      filter(alter_category == "Either candidate\n(HRC/DT)"),
    mapping = aes(label = round(100 * p)),
    fill = "grey70"
  ) +
  facet_wrap(~what, scales = "free") +
  coord_flip() +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal() +
  labs(
    title = "80% of the public preferred someone else than the two main candidates",
    x = "",
    y = ""
  )

# panel c: grouped answer categories as a share of each party-ID group
# (respondents without a party label are left out)
f1c_data <- us |>
  filter(alter_category != "NAP") |>
  drop_na(dem_rep_labs) |>
  mutate(
    alter_category = case_when(
      alter_category %in% c("Politician", "Primary") ~
        "Other politician/\nOther primary candidate*",
      alter_category == "Candidate" ~ "Either candidate\n(HRC/DT)",
      alter_category %in% c("Do not know", "Personal", "Other",
                            "Deity/Fictional", "Nobody", "Not candidates") ~
        "Other (collapse)",
      TRUE ~ alter_category
    )
  ) |>
  # denominator: number of respondents in the party-ID group
  add_count(dem_rep_labs, name = "pid_n") |>
  group_by(alter_category, dem_rep_labs) |>
  reframe(p = n() / unique(pid_n)) |>
  arrange(desc(p))

# panel c plot: same layout as panels a/b, one facet per party-ID group
f1c_fig <- f1c_data |>
  ggplot(aes(x = reorder(alter_category, p), y = p, ymin = 0, ymax = p)) +
  geom_linerange(colour = "grey70", linewidth = 1) +
  geom_linerange(
    data = f1c_data |>
      filter(alter_category == "Either candidate\n(HRC/DT)"),
    linewidth = 1,
    colour = "black"
  ) +
  geom_point(colour = "grey70", size = 1.5) +
  geom_label(
    data = f1c_data |>
      filter(alter_category == "Either candidate\n(HRC/DT)"),
    mapping = aes(label = round(100 * p)),
    fill = "grey70"
  ) +
  facet_wrap(~dem_rep_labs, ncol = 3) +
  coord_flip() +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal() +
  labs(
    title = "Pattern persists among party identifiers: 75% preferred someone else",
    caption = "* includes J Stein and G Johnson (total 4 responses).\nCandidate for Democrats is 70 HR Clinton and 5 D Trump.\nCandidate for Republicans is 2 HR Clinton and 48 D Trump.\nCandidate for Independents is 2 HR Clinton and 10 D Trump.",
    x = "",
    y = ""
  )

# Figure 1: panels a/b on the top row, panel c underneath
f1abc <- grid.arrange(f1ab_fig, f1c_fig, nrow = 2)

# uncomment to save
# ggsave(f1abc, file = "f1-abc.pdf",
#        width = 11, height = 8)
# ggsave(f1abc, file = "f1-abc.png",
#        width = 11, height = 8)

# drop the intermediate data and plot objects of Figure 1
rm(
  f1a_data, f1b_data, f1ab_data, f1c_data,
  f1ab_fig, f1c_fig, f1abc
)

# Figure 2
# grouped answer categories as a share of each vote-intention group
# (only respondents with a Clinton/Trump vote label are kept)
f2_data <- us |>
  filter(alter_category != "NAP") |>
  drop_na(vote_labs) |>
  mutate(
    alter_category = case_when(
      alter_category %in% c("Politician", "Primary") ~
        "Other politician/\nOther primary candidate*",
      alter_category == "Candidate" ~ "Either candidate\n(HRC/DT)",
      alter_category %in% c("Do not know", "Personal", "Other",
                            "Deity/Fictional", "Nobody", "Not candidates") ~
        "Other (collapse)",
      TRUE ~ alter_category
    )
  ) |>
  # denominator: number of respondents in the vote-intention group
  add_count(vote_labs, name = "pid_n") |>
  group_by(alter_category, vote_labs) |>
  reframe(p = n() / unique(pid_n)) |>
  arrange(desc(p))


# Figure 2 plot: one facet per vote intention, "Either candidate" row
# emphasised in black with its rounded percentage as label
f2_fig <- f2_data |>
  ggplot(aes(x = reorder(alter_category, p), y = p, ymin = 0, ymax = p)) +
  geom_linerange(colour = "grey70", linewidth = 1) +
  geom_linerange(
    data = f2_data |>
      filter(alter_category == "Either candidate\n(HRC/DT)"),
    linewidth = 1,
    colour = "black"
  ) +
  geom_point(colour = "grey70", size = 1.5) +
  geom_label(
    data = f2_data |>
      filter(alter_category == "Either candidate\n(HRC/DT)"),
    mapping = aes(label = round(100 * p)),
    fill = "grey70"
  ) +
  facet_wrap(~vote_labs, ncol = 2) +
  coord_flip() +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal() +
  labs(
    title = "75% of those who said will vote for either main candidate preferred someone else",
    caption = "* includes J Stein and G Johnson (total 4 responses).",
    x = "",
    y = ""
  )

f2_fig

# uncomment to save
# ggsave(f2_fig, file = "f2.pdf",
#        width = 9, height = 5)
# ggsave(f2_fig, file = "f2.png",
#        width = 9, height = 5)

# drop the intermediate objects of Figure 2
rm(f2_data, f2_fig)

# regression model (support for anyone else (1) vs either main candidate (0))
# case_when() uses the first matching condition, so "NAP" has to be tested
# before `!= "Candidate"`; in the reverse order the NAP branch can never match
# and those respondents are coded 1 instead of missing.
us <- us |> 
  mutate(alter_y = case_when(alter_category == "NAP" ~ NA_real_,
                             alter_category == "Candidate" ~ 0,
                             alter_category != "Candidate" ~ 1))

# logit of preferring an alternative on demographics, interest and demrep - 4;
# rows with a missing outcome (now including NAP) are dropped by glm()'s
# default NA handling
# NOTE(review): the last coefficient is labelled "Strength of party ID" below,
# but the term is demrep shifted by 4, not folded -- confirm the label.
m1 <- glm(alter_y ~ female + age + not_caucasian + edu_cat + relig + 
            interest + I(demrep - 4),
          data = us, family = binomial(link = "logit"))
summary(m1)

# uncomment to save
# wordreg(m1, file = "t1-alternatives.doc",
#                   custom.model.names = "Preferring alternative (=1)",
#                   digits = 3,
#                   custom.coef.names = c("Intercept",
#                                         "Female", "Age (in years)", 
#                                         "Not caucasian",
#                                         "Education (some college = 1)",
#                                         "Religiosity", "Political interest", 
#                                         "Strength of party ID"))

# print the model as a text table
screenreg(m1, custom.model.names = "Preferring alternative (=1)",
                digits = 3,
                custom.coef.names = c("Intercept",
                                      "Female", "Age (in years)", "Not caucasian",
                                      "Education (some college = 1)",
                                      "Religiosity", "Political interest", 
                                      "Strength of party ID"))

rm(m1)

# First thing that comes to mind: terms -----------------------------------

# remove those who did not answer/Not Applicable in original data
us <- us |> 
  filter(open_clintontom_clean != "NAP") |> 
  filter(open_trumptom_clean   != "NAP")

# Term summary: the 20 most frequent words per candidate
# (slice_max() keeps ties, so more than 20 rows are possible)
# NOTE(review): the "ass" pattern matches anywhere inside a word, so a word
# that merely contains it would be masked too -- check the top-20 lists.
trump <- us |> 
  group_by(caseid) |> 
  unnest_tokens(word, open_trumptom_clean) |> 
  ungroup() |> 
  count(word) |> 
  slice_max(n, n = 20) |> 
  mutate(word = str_replace_all(word, "ass", "a**")) |> # hide bad terms
  mutate(who = "Donald Trump")

clinton <- us |> 
  group_by(caseid) |> 
  unnest_tokens(word, open_clintontom_clean) |> 
  ungroup() |> 
  count(word) |> 
  slice_max(n, n = 20) |> 
  mutate(word = str_replace_all(word, "ass", "a**")) |> # hide bad terms
  # same mask as used for Figures 4 and 6 (was "b***c")
  mutate(word = str_replace_all(word, "bitch", "b**ch")) |> # hide bad terms
  mutate(who = "Hillary Clinton")

# Figure 3
# panel a: raw counts of the top words, one facet per candidate;
# reorder_within()/scale_x_reordered() sort the words inside each facet
fig3_a <- trump |> 
  bind_rows(clinton) |> 
  mutate(word = reorder_within(word, n, who)) |> 
  ggplot(aes(x = reorder(word, n), y = n, ymin = 0, ymax = n)) +
  geom_linerange(linewidth = 1, colour = "grey20") + 
  geom_point(size = 1.5, colour = "grey20") +
  coord_flip() +
  facet_wrap(~who, scales = "free") +
  scale_x_reordered() +
  labs(x = "", y = "",
       title = "Top 20 answers for first thing that comes to mind about",
       caption = "Raw counts.") +
  theme_minimal()

# term counts: frequency of every word, separately for each candidate
trump <- us |> 
  group_by(caseid) |> 
  unnest_tokens(word, open_trumptom_clean) |> 
  ungroup() |> 
  count(word) |> 
  rename(n_trump = n)
nrow(trump)

clinton <- us |> 
  group_by(caseid) |> 
  unnest_tokens(word, open_clintontom_clean) |> 
  ungroup() |> 
  count(word) |> 
  rename(n_clinton = n)
nrow(clinton)


# panel b: words used about both candidates (stop words removed) and the
# log2 ratio of their Trump count to their Clinton count.
# The join key is spelled out so the join cannot silently pick up other
# shared columns and does not print a "Joining with" message.
fig3_b <- inner_join(trump, clinton, by = "word") |> 
  anti_join(stop_words, by = "word") |> 
  mutate(lr = log2(n_trump / n_clinton)) |> 
  # hide bad terms
  mutate(word = str_replace_all(word, "ass\\b", "a**"),
         word = str_replace_all(word, "crap", "cr*p")) |> 
  ggplot(aes(x = reorder(word, lr), y = lr, ymin = 0, ymax = lr)) +  
  geom_linerange(linewidth = 1, colour = "grey20") + 
  geom_point(size = 1.5, colour = "grey20") +
  coord_flip() +
  labs(title = "Shared words and ratio", x = "", y = "",
       subtitle = "Log2 ratio of Donald Trump words/Hillary Clinton words") +
  theme_minimal()
  
# Figure 3: panels a and b side by side
fig3 <- grid.arrange(fig3_a, fig3_b, ncol = 2)

# uncomment to save
# ggsave(fig3, file = "f3-ab.pdf",
#        width = 11, height = 7)
# ggsave(fig3, file = "f3-ab.png",
#        width = 11, height = 7)

rm(fig3, fig3_a, fig3_b, clinton, trump)

# Figure 4 (grouped terms)
# per party-ID group: the 20 most frequent non-stop-words about each
# candidate, after mapping several dishonesty-related terms to "dishonest"
# NOTE(review): the replacement pattern is not anchored to whole words, so
# longer words containing one of the terms are rewritten as well -- confirm
# this is intended for the cleaned responses.
trump <- us |>
  drop_na(dem_rep_labs) |>
  group_by(dem_rep_labs) |>
  unnest_tokens(word, open_trumptom_clean) |>
  anti_join(stop_words, by = "word") |>
  mutate(
    word = str_replace_all(
      word,
      "liar|corrupt|crooked|crook|untrustworthy|untruthful",
      "dishonest"
    )
  ) |>
  count(word) |>
  mutate(who = "Top 20 first words that come to mind about Donald Trump") |>
  # rows without a party label were already removed above
  group_by(dem_rep_labs) |>
  slice_max(n, n = 20, with_ties = FALSE) |>
  mutate(word = reorder_within(word, n, dem_rep_labs))

clinton <- us |>
  drop_na(dem_rep_labs) |>
  group_by(dem_rep_labs) |>
  unnest_tokens(word, open_clintontom_clean) |>
  anti_join(stop_words, by = "word") |>
  mutate(
    word = str_replace_all(
      word,
      "liar|corrupt|crooked|crook|untrustworthy|untruthful",
      "dishonest"
    )
  ) |>
  count(word) |>
  mutate(who = "Top 20 first words that come to mind about Hillary Clinton") |>
  # rows without a party label were already removed above
  group_by(dem_rep_labs) |>
  slice_max(n, n = 20, with_ties = FALSE) |>
  mutate(word = reorder_within(word, n, dem_rep_labs))

# panel a: top words about Donald Trump by party-ID group.
# Bad terms are masked; because the "ass" replacement also hits "assertive",
# that word is restored afterwards.
fig4_a <- 
  trump |> 
  mutate(word = str_replace_all(word, "asshole", "a**hole"),
         word = str_replace_all(word, "ass", "a**"),
         word = str_replace_all(word, "a\\*\\*ertive", "assertive")) |> 
  ggplot(aes(x = reorder(word, n), y = n, ymin = 0, ymax = n)) +
  geom_linerange(linewidth = 1, colour = "grey20") + 
  geom_point(size = 1.5, colour = "grey20") +
  coord_flip() +
  facet_wrap( ~ dem_rep_labs, scales = "free") +
  scale_x_reordered() +
  # argument spelled out in full (was `label`, which only worked through
  # partial argument matching)
  scale_y_continuous(labels = scales::label_number(accuracy = 1)) +
  theme_minimal() +
  labs(title = "Top 20 first things that come to mind about Donald Trump",
       x = "", y = "")

# panel b: top words about Hillary Clinton by party-ID group
fig4_b <- 
  clinton |> 
  mutate(word = str_replace_all(word, "bitch", "b**ch")) |>
  ggplot(aes(x = reorder(word, n), y = n, ymin = 0, ymax = n)) +
  geom_linerange(linewidth = 1, colour = "grey20") + 
  geom_point(size = 1.5, colour = "grey20") +
  coord_flip() +
  facet_wrap( ~ dem_rep_labs, scales = "free") +
  scale_x_reordered() +
  scale_y_continuous(labels = scales::label_number(accuracy = 1)) +
  theme_minimal() +
  labs(title = "Top 20 first things that come to mind about Hillary Clinton",
       x = "", y = "")


# Figure 4: Trump panel above the Clinton panel
fig4 <- grid.arrange(fig4_a, fig4_b, ncol = 1)

# uncomment to save
# ggsave(fig4, file = "f4-ab.pdf",
#        width = 11, height = 7)
# ggsave(fig4, file = "f4-ab.png",
#        width = 11, height = 7)

rm(fig4, fig4_a, fig4_b, trump, clinton)


# Sentiment analysis ------------------------------------------------------
# keep only respondents with a coded sentiment for both candidates
us <- us |>
  filter(
    clinton_sent != "NAP",
    trump_sent != "NAP"
  )

# all data: share of negative (-1), neutral (0) and positive (1) sentiment
# per candidate, pooled over everyone with a party label
full <- us |>
  filter(!is.na(dem_rep_labs)) |>
  mutate(dem_rep_labs = "Full sample") |>
  group_by(dem_rep_labs) |>
  summarise(
    trump_neg = sum(trump_sent == -1) / n(),
    trump_neutral = sum(trump_sent == 0) / n(),
    trump_pos = sum(trump_sent == 1) / n(),
    clinton_neg = sum(clinton_sent == -1) / n(),
    clinton_neutral = sum(clinton_sent == 0) / n(),
    clinton_pos = sum(clinton_sent == 1) / n()
  )

# split by party-ID group: the same shares within each group
pid <- us |>
  filter(!is.na(dem_rep_labs)) |>
  group_by(dem_rep_labs) |>
  summarise(
    trump_neg = sum(trump_sent == -1) / n(),
    trump_neutral = sum(trump_sent == 0) / n(),
    trump_pos = sum(trump_sent == 1) / n(),
    clinton_neg = sum(clinton_sent == -1) / n(),
    clinton_neutral = sum(clinton_sent == 0) / n(),
    clinton_pos = sum(clinton_sent == 1) / n()
  )

# Figure 5
# sentiment shares for the full sample and each party-ID group (columns),
# separately for the two candidates (rows); candidate and sentiment are
# recovered from the names of the pivoted share columns
fig5 <- bind_rows(full, pid) |> 
  pivot_longer(cols = trump_neg:clinton_pos) |> 
  mutate(who  = ifelse(str_detect(name, "trump"), "Donald Trump", 
                      "Hillary Clinton"),
         what = ifelse(str_detect(name, "pos"), "Positive", 
                       ifelse(str_detect(name, "neg"), "Negative", 
                              "Neutral"))) |> 
  ggplot(aes(x = what, y = value, ymin = 0, ymax = value)) +
  geom_linerange(linewidth = 1, colour = "grey20") + 
  geom_point(size = 1.5, colour = "grey20") +
  coord_flip() +
  facet_grid(who ~ dem_rep_labs) +
  # argument spelled out in full (was `label`, which only worked through
  # partial argument matching)
  scale_y_continuous(labels = scales::percent_format()) +
  labs(title = "Overwhelmingly negative sentiment about the two main candidates",
       x = "", y = "") +
  # applied once; the second, identical call was removed
  theme_minimal()

fig5

# uncomment to save
# ggsave(fig5, file = "f5-ab.pdf",
#        width = 10, height = 6)
# ggsave(fig5, file = "f5-ab.png",
#        width = 10, height = 6)

rm(fig5, full, pid)


# Figure 6
# panel a: for Democrats and Republicans with a negative or positive
# sentiment towards Hillary Clinton, the five words with the largest share
# of all (non-stop) words in that party-by-sentiment cell
fig6a <- us |>
  filter(
    clinton_sent %in% c(-1, 1),
    dem_rep_labs %in% c("Self-reported Democrats", "Self-reported Republicans")
  ) |>
  group_by(dem_rep_labs, clinton_sent) |>
  unnest_tokens(word, open_clintontom_clean) |>
  anti_join(stop_words, by = "word") |>
  mutate(
    word = str_replace_all(
      word,
      "liar|corrupt|crooked|crook|untrustworthy|untruthful",
      "dishonest"
    )
  ) |>
  count(word) |>
  group_by(dem_rep_labs, clinton_sent) |>
  # all_n: total word count of the cell; share: word count relative to it
  mutate(
    all_n = sum(n),
    share = n / all_n
  ) |>
  slice_max(share, n = 5, with_ties = FALSE) |>
  # hide bad terms ("assertive" is restored after the "ass" replacement)
  mutate(
    word = str_replace_all(word, "asshole", "a**hole"),
    word = str_replace_all(word, "ass", "a**"),
    word = str_replace_all(word, "a\\*\\*ertive", "assertive"),
    word = str_replace_all(word, "bitch", "b**ch")
  ) |>
  # facet title: party, sentiment and total word count of the cell
  mutate(
    clinton_sent_lab = ifelse(clinton_sent == -1, "Negative", "Positive"),
    lab = paste0(dem_rep_labs, ": ", clinton_sent_lab, " (", all_n, ")")
  ) |>
  ggplot(aes(x = reorder(word, share), y = share, ymin = 0, ymax = share)) +
  geom_linerange(colour = "grey20", linewidth = 1) +
  geom_point(colour = "grey20", size = 1.5) +
  facet_wrap(~lab, scales = "free") +
  coord_flip() +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(title = "About Hillary Clinton", x = "", y = "") +
  theme_minimal()

# panel b: same construction as panel a, for the words about Donald Trump
fig6b <- us |>
  filter(
    trump_sent %in% c(-1, 1),
    dem_rep_labs %in% c("Self-reported Democrats", "Self-reported Republicans")
  ) |>
  group_by(dem_rep_labs, trump_sent) |>
  unnest_tokens(word, open_trumptom_clean) |>
  anti_join(stop_words, by = "word") |>
  mutate(
    word = str_replace_all(
      word,
      "liar|corrupt|crooked|crook|untrustworthy|untruthful",
      "dishonest"
    )
  ) |>
  count(word) |>
  group_by(dem_rep_labs, trump_sent) |>
  # all_n: total word count of the cell; share: word count relative to it
  mutate(
    all_n = sum(n),
    share = n / all_n
  ) |>
  slice_max(share, n = 5, with_ties = FALSE) |>
  # hide bad terms ("assertive" is restored after the "ass" replacement)
  mutate(
    word = str_replace_all(word, "asshole", "a**hole"),
    word = str_replace_all(word, "ass", "a**"),
    word = str_replace_all(word, "a\\*\\*ertive", "assertive")
  ) |>
  # facet title: party, sentiment and total word count of the cell
  mutate(
    trump_sent_lab = ifelse(trump_sent == -1, "Negative", "Positive"),
    lab = paste0(dem_rep_labs, ": ", trump_sent_lab, " (", all_n, ")")
  ) |>
  ggplot(aes(x = reorder(word, share), y = share, ymin = 0, ymax = share)) +
  geom_linerange(colour = "grey20", linewidth = 1) +
  geom_point(colour = "grey20", size = 1.5) +
  facet_wrap(~lab, scales = "free") +
  coord_flip() +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(
    title = "About Donald Trump",
    caption = "Numbers in parentheses are the total count of words in the cell.",
    x = "",
    y = ""
  ) +
  theme_minimal()

# Figure 6: Clinton panel above the Trump panel
fig6 <- grid.arrange(
  fig6a,
  fig6b,
  ncol = 1
)

# uncomment to save
# ggsave(fig6, file = "f6-ab.pdf",
#        width = 10, height = 8)
# ggsave(fig6, file = "f6-ab.png",
#        width = 10, height = 8)

# drop the plot objects of Figure 6
rm(fig6a, fig6b, fig6)

# split by Primaries
# creating Figures 7 and 8
# Democrats: respondents with a non-missing Democratic primary vote
# NOTE(review): the counts in the facet labels are hard-coded, and every
# code other than 1 ends up in the "Other candidate" group -- confirm both
# against the data.
dems <- us |>
  filter(!is.na(demprimary2016)) |>
  mutate(
    prime_lab = case_when(
      demprimary2016 == 1 ~ "Voted in primary\nfor Hillary Clinton (133)",
      TRUE ~ "Voted in primary for\nOther candidate (118)"
    ),
    # open-ended preference in three groups; a missing category stays NA
    vote_lab = case_when(
      alter_category == "Candidate" ~ "Hillary Clinton",
      alter_category == "Primary" ~ "Other primary candidate",
      !is.na(alter_category) ~ "Someone else"
    )
  )

# panel a: open-ended preference as a share of each primary-vote group
fig7a <- dems |>
  add_count(prime_lab, name = "total") |>
  group_by(prime_lab, vote_lab) |>
  summarise(p = n() / unique(total)) |>
  ggplot(aes(x = vote_lab, y = p, ymin = 0, ymax = p)) +
  geom_linerange(colour = "grey20", linewidth = 1) +
  geom_point(colour = "grey20", size = 1.5) +
  facet_wrap(~prime_lab) +
  coord_flip() +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(title = "Open-ended president preferences", x = "", y = "") +
  theme_minimal()

# panel b: sentiment towards Hillary Clinton as a share of each
# primary-vote group; the -1/0/1 codes are turned into labels for the axis
fig7b <- dems |>
  add_count(prime_lab, name = "total") |>
  group_by(prime_lab, clinton_sent) |>
  summarise(p = n() / unique(total)) |>
  mutate(
    clinton_sent = ifelse(
      clinton_sent == -1,
      "Negative",
      ifelse(clinton_sent == 1, "Positive", "Neutral")
    )
  ) |>
  ggplot(aes(x = clinton_sent, y = p, ymin = 0, ymax = p)) +
  geom_linerange(colour = "grey20", linewidth = 1) +
  geom_point(colour = "grey20", size = 1.5) +
  facet_wrap(~prime_lab) +
  coord_flip() +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(title = "Sentiment towards Hillary Clinton", x = "", y = "") +
  theme_minimal()

# Figure 7: both panels side by side
fig7 <- grid.arrange(fig7a, fig7b, ncol = 2)

# uncomment to save
# ggsave(fig7, file = "f7-ab.pdf",
#        width = 12, height = 6)
# ggsave(fig7, file = "f7-ab.png",
#        width = 12, height = 6)

# drop the data and plot objects of Figure 7
rm(dems, fig7a, fig7b, fig7)

# Republicans: respondents with a non-missing Republican primary vote
# NOTE(review): the counts in the facet labels are hard-coded, and every
# code other than 16 ends up in the "Other candidate" group -- confirm both
# against the data.
reps <- us |>
  filter(!is.na(repprimary2016)) |>
  mutate(
    prime_lab = case_when(
      repprimary2016 == 16 ~ "Voted in primary\nfor Donald Trump (109)",
      TRUE ~ "Voted in primary for\nOther candidate (105)"
    ),
    # open-ended preference in three groups; a missing category stays NA
    vote_lab = case_when(
      alter_category == "Candidate" ~ "Donald Trump",
      alter_category == "Primary" ~ "Other primary candidate",
      !is.na(alter_category) ~ "Someone else"
    )
  )

# panel a: open-ended preference as a share of each primary-vote group
fig8a <- reps |>
  add_count(prime_lab, name = "total") |>
  group_by(prime_lab, vote_lab) |>
  summarise(p = n() / unique(total)) |>
  ggplot(aes(x = vote_lab, y = p, ymin = 0, ymax = p)) +
  geom_linerange(colour = "grey20", linewidth = 1) +
  geom_point(colour = "grey20", size = 1.5) +
  facet_wrap(~prime_lab) +
  coord_flip() +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(title = "Open-ended president preferences", x = "", y = "") +
  theme_minimal()

# panel b: sentiment towards Donald Trump as a share of each primary-vote
# group; the -1/0/1 codes are turned into labels for the axis
fig8b <- reps |>
  add_count(prime_lab, name = "total") |>
  group_by(prime_lab, trump_sent) |>
  summarise(p = n() / unique(total)) |>
  mutate(
    trump_sent = ifelse(
      trump_sent == -1,
      "Negative",
      ifelse(trump_sent == 1, "Positive", "Neutral")
    )
  ) |>
  ggplot(aes(x = trump_sent, y = p, ymin = 0, ymax = p)) +
  geom_linerange(colour = "grey20", linewidth = 1) +
  geom_point(colour = "grey20", size = 1.5) +
  facet_wrap(~prime_lab) +
  coord_flip() +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(title = "Sentiment towards Donald Trump", x = "", y = "") +
  theme_minimal()

# Figure 8: both panels side by side
fig8 <- grid.arrange(fig8a, fig8b, ncol = 2)

# uncomment to save
# ggsave(fig8, file = "f8-ab.pdf",
#        width = 12, height = 6)
# ggsave(fig8, file = "f8-ab.png",
#        width = 12, height = 6)

# drop the data and plot objects of Figure 8
rm(reps, fig8a, fig8b, fig8)


# Regression models for sentiment -----------------------------------------
# outcome per candidate: 1 when the coded sentiment is negative (-1), else 0
us <- mutate(
  us,
  clinton_sent_y = ifelse(clinton_sent == -1, 1, 0),
  trump_sent_y = ifelse(trump_sent == -1, 1, 0)
)

# logit models with identical predictors, one per candidate
m2 <- glm(
  clinton_sent_y ~ female + age + not_caucasian + edu_cat + relig +
    interest + demrep,
  family = binomial(link = "logit"),
  data = us
)
m3 <- glm(
  trump_sent_y ~ female + age + not_caucasian + edu_cat + relig +
    interest + demrep,
  family = binomial(link = "logit"),
  data = us
)

# uncomment to save
# wordreg(list(m2, m3), file = "t2-sentiment.doc",
#                 custom.model.names = c("Negative sentiment towards HRC",
#                                        "Negative sentiment towards DT"),
#                 digits = 3,
#                 custom.coef.names = c("Intercept",
#                                       "Female", "Age (in years)", 
#                                       "Not caucasian",
#                                       "Education (some college = 1)",
#                                       "Religiosity", "Political interest", 
#                                       "Party ID (D to R)"))

# Table 3
screenreg(
  list(m2, m3),
  custom.model.names = c(
    "Negative sentiment towards HRC",
    "Negative sentiment towards DT"
  ),
  digits = 3,
  custom.coef.names = c(
    "Intercept",
    "Female",
    "Age (in years)",
    "Not caucasian",
    "Education (some college = 1)",
    "Religiosity",
    "Political interest",
    "Party ID (D to R)"
  )
)

rm(m2, m3)

# Sentiment coding validation (in text + table) --------------------------------
# discussed when introducing and reviewing human coding
# features in Table 2
# how often each full (cleaned) answer occurs, most frequent first
clinton <- us |>
  filter(open_clintontom_clean != "NAP") |>
  count(term = open_clintontom_clean, sort = TRUE)
trump <- us |>
  filter(open_trumptom_clean != "NAP") |>
  count(term = open_trumptom_clean, sort = TRUE)

# answers pooled over both candidates, with their total frequency and the
# number of words they consist of
terms <- bind_rows(trump, clinton) |>
  group_by(term) |>
  summarise(n_total = sum(n)) |>
  arrange(desc(n_total)) |>
  mutate(term_length = str_count(term, "\\w+"))

# Dictionary 1
# Compare the human-coded sentiment with the bing lexicon: every word of an
# answer is scored 1 (positive), -1 (negative) or 0 (not in the lexicon) and
# the scores are averaged per respondent.
# All join keys are spelled out so that the joins do not depend on whichever
# columns the tables happen to share and do not print "Joining with" messages.
bing_sent <- get_sentiments("bing")

clinton <- us |> 
  group_by(caseid) |> 
  unnest_tokens(word, open_clintontom_clean) |> 
  left_join(bing_sent, by = "word") |> 
  mutate(sentiment = ifelse(sentiment == "positive", 1, -1),
         # words without a lexicon entry are NA after the join
         sentiment = ifelse(is.na(sentiment), 0, sentiment),
         sentiment = as.numeric(sentiment)) |> 
  ungroup() |> 
  group_by(caseid) |> 
  summarise(bing_clinton = mean(sentiment)) |> 
  left_join(select(us, caseid, clinton_sent), by = "caseid")

trump <- us |> 
  group_by(caseid) |> 
  unnest_tokens(word, open_trumptom_clean) |> 
  left_join(bing_sent, by = "word") |> 
  mutate(sentiment = ifelse(sentiment == "positive", 1, -1),
         # words without a lexicon entry are NA after the join
         sentiment = ifelse(is.na(sentiment), 0, sentiment),
         sentiment = as.numeric(sentiment)) |> 
  ungroup() |> 
  group_by(caseid) |> 
  summarise(bing_trump = mean(sentiment)) |> 
  left_join(select(us, caseid, trump_sent), by = "caseid")

# mean lexicon score within each human-coded sentiment category
clinton |> group_by(clinton_sent) |> 
  summarise(m = mean(bing_clinton))
trump |> group_by(trump_sent) |> 
  summarise(m = mean(bing_trump))

# answers coded neutral by hand but fully positive in the lexicon
trump |> filter(trump_sent == 0 & bing_trump == 1) |> 
  left_join(select(us, caseid, open_trumptom_clean), by = "caseid")

# answers coded positive by hand but not fully positive in the lexicon
trump |> filter(trump_sent == 1 & bing_trump != 1) |> 
  left_join(select(us, caseid, open_trumptom_clean), by = "caseid")

rm(clinton, trump)

# Dictionary 2
# Same comparison with quanteda's data_dictionary_LSD2015: each word is
# looked up on its own, scored 1 ("POSITIVE"), -1 ("NEGATIVE") or 0 (any
# other result) and averaged per respondent.
# Join keys are spelled out so the joins do not depend on whichever columns
# the tables happen to share and do not print "Joining with" messages.
trump <- us |> 
  group_by(caseid) |> 
  unnest_tokens(word, open_trumptom_clean)

# bind_cols() matches by position, so the lookup has to return exactly one
# value per word row
trump <- tokens_lookup(tokens(trump$word), 
              dictionary = data_dictionary_LSD2015, exclusive = FALSE) |> 
  unlist() |> 
  data.frame() |> 
  bind_cols(trump)
# the lookup result is the first column
names(trump)[1] <- "trump_lexi"
trump <- trump |> 
  mutate(trump_lexi = ifelse(trump_lexi == "POSITIVE", 1,
                             ifelse(trump_lexi == "NEGATIVE", -1, 0)),
         trump_lexi = as.numeric(trump_lexi)) |> 
  group_by(caseid) |> 
  summarise(trump_lexi = mean(trump_lexi)) |> 
  left_join(select(us, caseid, trump_sent), by = "caseid")

clinton <- us |> 
  group_by(caseid) |> 
  unnest_tokens(word, open_clintontom_clean)

# bind_cols() matches by position, so the lookup has to return exactly one
# value per word row
clinton <- tokens_lookup(tokens(clinton$word), 
                       dictionary = data_dictionary_LSD2015, exclusive = FALSE) |> 
  unlist() |> 
  data.frame() |> 
  bind_cols(clinton)
# the lookup result is the first column
names(clinton)[1] <- "clinton_lexi"
clinton <- clinton |> 
  mutate(clinton_lexi = ifelse(clinton_lexi == "POSITIVE", 1,
                             ifelse(clinton_lexi == "NEGATIVE", -1, 0)),
         clinton_lexi = as.numeric(clinton_lexi)) |> 
  group_by(caseid) |> 
  summarise(clinton_lexi = mean(clinton_lexi)) |> 
  left_join(select(us, caseid, clinton_sent), by = "caseid")

# mean lexicon score within each human-coded sentiment category
clinton |> group_by(clinton_sent) |> 
  summarise(m = mean(clinton_lexi))

trump |> group_by(trump_sent) |> 
  summarise(m = mean(trump_lexi))

rm(trump, clinton)
rm(terms, us)

# sessionInfo()
